DATA PREPARATION

PRIMATES’ NEOPLASY PROJECT

MIGUEL RAMON ALONSO

knitr::opts_knit$set(root.dir = "/home/rstudio/NEOPLASY_PRIMATES")
knitr::opts_chunk$set(engine.path = list(
python = '/home/rstudio/NEOPLASY_PRIMATES/TreeCluster/venv/bin/python'))
getwd()
## [1] "/home/rstudio/NEOPLASY_PRIMATES"

Directory definition

workingDir <- getwd()
dataDir <- file.path(workingDir, "Data")
resultsDir <- file.path(workingDir,"Out") 
getwd()
## [1] "/home/rstudio/NEOPLASY_PRIMATES"

Seed and library loading

set.seed(1998)

# General use libraries
library(phytools)
library(dplyr)
library(castor)
library(tidyr)

# Plotting
library(ggplot2)
library(ggtree)

# Colors
library(microViz)

Tree import

primates_tree <- read.newick(file.path(dataDir,"233-GENOMES/science.abn7829_data_s4.nex.tree"))

# Give a name to species name attribute
tree_species <- primates_tree$tip.label

Cancer traitfile import

cancer_traits <- read.csv("Data/Neoplasia/species360_primates_neoplasia_20230519.csv", sep = ",")

# Remove whitespaces
cancer_traits[] <- lapply(cancer_traits, function(col) trimws(col))

# binarize
cancer_traits$label <- gsub(" ", "_", cancer_traits$species)

# Reorder and filter species
cancer_traits <- cancer_traits %>%
  select(family,species,label,common_name,necropsy_count,neoplasia_necropsy,neoplasia_prevalence, everything()) %>%
  filter(!is.na(neoplasia_prevalence))

Cancer trait file polishing

Remove spaces at the end, work out name dissimilarities and stuff like that

common_names <- intersect(primates_tree$tip.label, cancer_traits$label)

direct_matches <- cancer_traits$label %in% tree_species
cancer_species_to_check <- cancer_traits$label[!direct_matches]

tree <- drop.tip(primates_tree, setdiff(primates_tree$tip.label, common_names))

### Mantain structure

cancer_traits <- cancer_traits %>%
  mutate(ordering = match(label, tree$tip.label)) %>%
  arrange(ordering)  %>%
  filter(!is.na(ordering))

write.tree(tree, file="Out/1.Pruning/tree_pruning/science.abn7829_data_s4.nex.tree.pruned")

Dissimilarities between the two namesets (tree and traits)

Clustering by branch depth

Revised to do the exploration node-wise
# Get distances from all nodes to root
sorted_distances <- sort(get_all_distances_to_root(tree))

# Identify duplicates rounded to 5 decimal places (for significance)
duplicates <- duplicated(round(unlist(sorted_distances), 5))

# Remove duplicates
unique_distances <- sorted_distances[!duplicates]

# Cut the tree using TreeCluster (bash)
for (index in seq_along(unique_distances)) {
  node <- unique_distances[[index]]
  
  # Visualice the node
  # echo_command <- sprintf("echo %f", node)
  # system(echo_command)
    
  # Cut the tree by every possible nodal distance and collect the clusters in separated files for further use
  output_path <- sprintf("Out/2.Clustering/tree_clustering/%d_cut_%f.clusters", index, node)
  tree_cluster_command <- sprintf("python3 TreeCluster/TreeCluster.py -i Out/1.Pruning/tree_pruning/science.abn7829_data_s4.nex.tree.pruned -t %f --method root_dist -o %s", node, output_path)
  system(tree_cluster_command)
}

Now lets merge all the clusters into a single dataframe, including also the families

# Identify all cluster files
cluster_files <- list.files(path = "Out/2.Clustering/tree_clustering/", full.names = TRUE)

# Function to read and process a single cluster file
process_cluster_file <- function(file_path) {
  # Extract just the filename without the extension
  filename_no_ext <- tools::file_path_sans_ext(basename(file_path))
  
  # Extract the desired portion for the header by splitting on '_'
  components <- strsplit(filename_no_ext, "_")[[1]]
  cut_point <- paste0(components[length(components) - 1], "_", tail(components, 1))
  
  # Read the file into a dataframe
  df <- read.csv(file_path, sep = "\t", stringsAsFactors = FALSE)
  
  # Rename columns
  colnames(df)[colnames(df) == "SequenceName"] <- "label"
  colnames(df)[colnames(df) == "ClusterNumber"] <- cut_point
  
  return(df)
}

# Process all cluster files
list_of_dfs <- lapply(cluster_files, process_cluster_file)

# Merge all data frames by SequenceName
clusters_df <- Reduce(function(x, y) {
  merge(x, y, by = "label", all.y = TRUE, no.dups = TRUE)
}, list_of_dfs)

# Keep the SPECIES_BINOMIAL column fixed and order the rest
ordered_columns <- c("label", rev(colnames(clusters_df)[-1]))

# Reorder the columns based on the sorted column names
clusters_df <- clusters_df[, ordered_columns]

# Remove duplicated cuts
## This removes redundancy, as different, very close cuts have exactly the same clusters
clusters_unique <- clusters_df[!duplicated(t(clusters_df))]

# Add families from traitfile
merged_traits <- merge(clusters_unique, cancer_traits, by="label", all.x=TRUE)
clusters_unique <- cbind(clusters_unique[,1], merged_traits$family, clusters_unique[,-1])

colnames(clusters_unique)[1] <- "species"
colnames(clusters_unique)[2] <- "family"

# Recover the cluster file por posterior use
write.csv(clusters_unique, file="Out/2.Clustering/traits_clusters_exploration/clusters_unique.csv")

Create expanded clusters file for further analyses

In order to include species which are not present in the phylogeny but might be interesting to see how the phenotype is distributed, add them to an expanded data frame, replicating clustering data from closest individual
clusters_unique_expanded <- clusters_unique

# Extract the genus from SPECIES_BINOMIAL
clusters_unique_expanded$GENUS <- sapply(strsplit(as.character(clusters_unique_expanded$species), "_"), `[`, 1)
genus_to_check <- sapply(strsplit(common_names, "_"), `[`, 1)

# Loop through cancer_species_to_check and match with merged_traits
for(name in cancer_species_to_check){
  
  print(name)
  
  # Find the genus of the species
  genus <- strsplit(name, "_")[[1]][1]
  
  # Check if the genus exists in merged_traits
  matching_index <- which(clusters_unique_expanded$GENUS == genus)
  
  # If a match is found, duplicate the row and modify the SPECIES_BINOMIAL
  if(length(matching_index) > 0){
    print(paste(name,"matches!"))
    new_row <- clusters_unique_expanded[matching_index[1],]
    new_row$species <- name
    clusters_unique_expanded <- rbind(clusters_unique_expanded, new_row)
  }
}
## [1] "Ateles_fusciceps"
## [1] "Ateles_fusciceps matches!"
## [1] "Cebus_capucinus"
## [1] "Macaca_sylvanus"
## [1] "Macaca_sylvanus matches!"
## [1] "Nomascus_leucogenys"
## [1] "Pithecia_pithecia"
## [1] "Plecturocebus_donacophilus"
## [1] "Saguinus_leucopus"
## [1] "Saguinus_leucopus matches!"
## [1] "Saimiri_boliviensis"
## [1] "Saimiri_boliviensis matches!"
## [1] "Symphalangus_syndactylus"
# Remove genus column
clusters_unique_expanded$GENUS <- NULL

# Recover the cluster file por posterior use
write.csv(clusters_unique_expanded, file="Out/2.Clustering/traits_clusters_exploration/clusters_unique_expanded.csv")

Visualizing cluster distribution with species present in the phylogeny and trait values

# Extracting tip labels order from the phylo object
ordered_species <- rev(tree$tip.label)

# Create a new factor with the desired order for SPECIES_BINOMIAL
clusters_unique$species <- factor(clusters_unique$species, levels = ordered_species)

# Convert the data frame from wide to long format
clusters_long <- clusters_unique %>%
  gather(key = "cut", value = "cluster", starts_with("cut"))

# Add a cut for FAMILY to visualize it alongside the other cuts
clusters_long <- rbind(clusters_long, data.frame(species = clusters_unique$species, cut = "FAMILY", cluster = as.character(clusters_unique$family), family = clusters_unique$family))

# Reorder the levels of the cut factor to place FAMILY first
ordered_cuts <- c("FAMILY", sort(unique(clusters_long$cut[clusters_long$cut != "FAMILY"])))
clusters_long$cut <- factor(clusters_long$cut, levels = ordered_cuts)

# Calculate the number of unique cluster values and families
num_clusters <- length(unique(clusters_long$cluster))
num_families <- length(unique(clusters_unique$family))


# Combine the colors and levels
all_colors <- distinct_palette(25)
all_levels <- unique(c(as.character(clusters_unique$family), as.character(clusters_long$cluster)))

# Plot

heatmap_plot <- ggplot(clusters_long, aes(x = cut, y = species)) +
  geom_tile(aes(fill = cluster), color = "white", linewidth = 0.5) +
  scale_fill_manual(values = setNames(all_colors, all_levels)) +
  scale_x_discrete(expand = c(0,0)) +
  labs(y = "Species", x = "Cut & Family", fill = "Value/Group") +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 90, size = 14),
    axis.text.y = element_text(size = 14, hjust = 0.5),
    axis.title.x = element_text(size = 16, hjust = 0.5, margin = margin(t = 20, b = 10)),
    axis.title.y = element_text(size = 16, angle = 90,hjust = 0.5, margin = margin(t = 20, b = 10)),
    legend.title = element_text(size = 14),
    legend.text = element_text(size = 12))
    
heatmap_plot

ggsave(filename = "Out/2.Clustering/clustering_family_comparison/cluster_family_comp.svg", plot = heatmap_plot, width = 18, height = 18, device = "svg")

Visualizing cluster distribution with species not present in the phylogeny

# Factorice and order by family
clusters_unique_expanded$family <- as.factor(sort(clusters_unique_expanded$family))

# Convert the data frame from wide to long format
clusters_expanded_long <- clusters_unique_expanded %>%
  gather(key = "cut", value = "cluster", starts_with("cut"))

# Add a cut for FAMILY to visualize it alongside the other cuts
clusters_expanded_long <- rbind(clusters_expanded_long, data.frame(species = clusters_unique_expanded$species, cut = "FAMILY", cluster = as.character(clusters_unique_expanded$family), family = clusters_unique_expanded$family))

# Reorder the levels of the cut factor to place FAMILY first
ordered_cuts <- c("FAMILY", sort(unique(clusters_expanded_long$cut[clusters_expanded_long$cut != "FAMILY"])))
clusters_expanded_long$cut <- factor(clusters_expanded_long$cut, levels = ordered_cuts)

# Calculate the number of unique cluster values and families
num_clusters <- length(unique(clusters_expanded_long$cluster))
num_families <- length(unique(clusters_unique_expanded$FAMILY))


# Combine the colors and levels
all_colors <- distinct_palette(25)
all_levels <- unique(c(as.character(clusters_unique$family), as.character(clusters_long$cluster)))

# Plot
heatmap_expanded_plot <- ggplot(clusters_expanded_long, aes(x = cut, y = species)) +
  geom_tile(aes(fill = cluster), color = "white", linewidth = 0.5) +
  scale_fill_manual(values = setNames(all_colors, all_levels)) +
  scale_x_discrete(expand = c(0,0)) +
  labs(y = "Species", x = "Cut & Family", fill = "Value/Group") +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 90, size = 14),
    axis.text.y = element_text(size = 14, hjust = 0.5),
    axis.title.x = element_text(size = 16, hjust = 0.5, margin = margin(t = 20, b = 10)),
    axis.title.y = element_text(size = 16, angle = 90, hjust = 0.5, margin = margin(t = 20, b = 10)),
    legend.title = element_text(size = 14),
    legend.text = element_text(size = 12))
    
print(heatmap_expanded_plot)

ggsave(filename = "Out/2.Clustering/clustering_family_comparison/cluster_family_comp_w_outspecies.svg", plot = heatmap_expanded_plot, width = 18, height = 18, device = "svg")

Plot together our heatmap with the phylogenetic tree

tree <- ape::rotateConstr(tree, ordered_species)
tree_plot <- ggtree(tree, ladderize = FALSE)

combined_plot <- tree_plot + heatmap_plot

# still same problemo

combined_plot

ggsave(filename = "Out/2.Clustering/clustering_family_comparison/cluster_family_comp_w_tree.png", plot = combined_plot, width = 36, height = 18, dpi = 300)

ggsave(filename = "Out/2.Clustering/clustering_family_comparison/cluster_family_comp_w_tree.svg", plot = combined_plot, width = 36, height = 18, device = "svg")